Differential expression using the bulk RNAseq JAX_RNAseq_ExtraEmbryonic data from the December 2023 release.

There are two model systems involved in this dataset, ExM & PrS.

There are also both normoxia and hypoxia samples involved in this dataset. Need to create another column for this information.

There is only one day value.

R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

#dat_path = "~/research/data/MorPhiC/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")

Read in meta data

meta = fread("data/December_2023/JAX_RNAseq_ExtraEmbryonic_meta_data.tsv", 
             data.table = FALSE)
dim(meta)
## [1] 90 32
meta[1:2,]
##   biomaterial_id lib_biomaterial_id differentiated_biomaterial_id
## 1    MOK20010-WT         GT23-10491               PrS-MOK20010-WT
## 2  MOK20010C-A06         GT23-10506             PrS-MOK20010C-A06
##                                                 differentiated_biomaterial_description
## 1                                                  KOLF2.2 derived primitive syncytium
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) derived primitive syncytium
##   differentiation_protocol differentiated_timepoint_value
## 1                 JAXPD001                              6
## 2                 JAXPD001                              6
##   differentiated_timepoint_unit differentiated_terminally_differentiated
## 1                          days                                      Yes
## 2                          days                                      Yes
##                                 model_organ ko_strategy ko_gene
## 1 extra-embryonic primitive syncytial cells          WT      WT
## 2 extra-embryonic primitive syncytial cells          CE  POU2F3
##          library_preparation_protocol dissociation_protocol fragment_size
## 1 Protocol ID for library preparation                   N.A           431
## 2 Protocol ID for library preparation                   N.A           428
##   input_amount input_unit final_yield final_yield_unit lib_concentration
## 1          300        ngs         244              ngs              47.7
## 2          300        ngs         244              ngs              47.7
##   lib_concentration_unit PCR_cycle PCR_cycle_sample_index
## 1                     nM        10                     NA
## 2                     nM        10                     NA
##                                                file_id sequencing_protocol
## 1 KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001            JAXPS001
## 2  POU2F3_CE_B05_GT23-10506_GCAGAATT-TGGCCGGT_S17_L001            JAXPS001
##                        run_id
## 1 20230809_23-robson-005-run2
## 2 20230809_23-robson-005-run2
##                                    biomaterial_description
## 1                                                  KOLF2.2
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE)
##   derived_cell_line_accession clone_id protocol_id zygosity type model_system
## 1                    KOLF2.2J       WT         N.A      N.A iPSC          PrS
## 2                    KOLF2.2J      A06    JAXPE002      N.A iPSC          PrS
names(meta)
##  [1] "biomaterial_id"                          
##  [2] "lib_biomaterial_id"                      
##  [3] "differentiated_biomaterial_id"           
##  [4] "differentiated_biomaterial_description"  
##  [5] "differentiation_protocol"                
##  [6] "differentiated_timepoint_value"          
##  [7] "differentiated_timepoint_unit"           
##  [8] "differentiated_terminally_differentiated"
##  [9] "model_organ"                             
## [10] "ko_strategy"                             
## [11] "ko_gene"                                 
## [12] "library_preparation_protocol"            
## [13] "dissociation_protocol"                   
## [14] "fragment_size"                           
## [15] "input_amount"                            
## [16] "input_unit"                              
## [17] "final_yield"                             
## [18] "final_yield_unit"                        
## [19] "lib_concentration"                       
## [20] "lib_concentration_unit"                  
## [21] "PCR_cycle"                               
## [22] "PCR_cycle_sample_index"                  
## [23] "file_id"                                 
## [24] "sequencing_protocol"                     
## [25] "run_id"                                  
## [26] "biomaterial_description"                 
## [27] "derived_cell_line_accession"             
## [28] "clone_id"                                
## [29] "protocol_id"                             
## [30] "zygosity"                                
## [31] "type"                                    
## [32] "model_system"
table(table(meta$biomaterial_id))
## 
##  1  2 
## 66 12
table(table(meta$lib_biomaterial_id))
## 
##  1 
## 90
table(table(meta$differentiated_biomaterial_id))
## 
##  1 
## 90

Extract condition information

meta$differentiated_biomaterial_id[1:6]
## [1] "PrS-MOK20010-WT"   "PrS-MOK20010C-A06" "PrS-MOK20010C-B05"
## [4] "PrS-MOK20010C-B06" "PrS-MOK20010P-A09" "PrS-MOK20010P-A10"
cell_line_id = str_split(meta$differentiated_biomaterial_id, pattern = '-')
table(sapply(cell_line_id, length))
## 
##  3  4 
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
cell_line_id[1:6, ]
##    V1        V2  V3   V4
## 1 PrS  MOK20010  WT <NA>
## 2 PrS MOK20010C A06 <NA>
## 3 PrS MOK20010C B05 <NA>
## 4 PrS MOK20010C B06 <NA>
## 5 PrS MOK20010P A09 <NA>
## 6 PrS MOK20010P A10 <NA>
meta$condition = cell_line_id[, 4]

meta$condition[which(is.na(meta$condition))] = "nor"
table(meta$condition, useNA="ifany")
## 
## hyp nor 
##  12  78
table(meta$model_organ, meta$condition, useNA="ifany")
##                                            
##                                             hyp nor
##   extra-embryonic mesenchymal cells           0  12
##   extra-embryonic primitive syncytial cells  12  66
table(meta$run_id, meta$condition, useNA="ifany")
##                              
##                               hyp nor
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008       12  11
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-008-run2   0   1
##   20231004_23-robson-011        0  12

Manually update one run id

Run ID “20231004_23-robson-008-run2” is very similar to “20230918_23-robson-008” in experiment. It is merged into “20230918_23-robson-008”.

table(meta$run_id, useNA="ifany")
## 
## 20230809_23-robson-005-run2      20230918_23-robson-008 
##                          30                          23 
##      20230918_23-robson-009      20230918_23-robson-010 
##                          12                          12 
## 20231004_23-robson-008-run2      20231004_23-robson-011 
##                           1                          12
w2update = which(meta$run_id == "20231004_23-robson-008-run2")
meta$run_id[w2update] = "20230918_23-robson-008"

table(meta$run_id, meta$model_system, useNA="ifany")
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008        0  24
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011       12   0
table(meta$run_id, meta$condition, useNA="ifany")
##                              
##                               hyp nor
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008       12  12
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011        0  12
table(meta$model_system, meta$ko_gene, useNA="ifany")
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   ExM     0    0    0     0    9      0     0  3
##   PrS    18    9    9     9    0      9     9 15
table(meta$run_id, meta$ko_gene, useNA="ifany")
##                              
##                               EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   20230809_23-robson-005-run2     0    9    0     9    0      9     0  3
##   20230918_23-robson-008         18    0    0     0    0      0     0  6
##   20230918_23-robson-009          0    0    9     0    0      0     0  3
##   20230918_23-robson-010          0    0    0     0    0      0     9  3
##   20231004_23-robson-011          0    0    0     0    9      0     0  3
table(meta$ko_strategy, meta$ko_gene, useNA="ifany")
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   CE      6    3    3     3    3      3     3  0
##   KO      6    3    3     3    3      3     3  0
##   PTC     6    3    3     3    3      3     3  0
##   WT      0    0    0     0    0      0     0 18

Read in gene count data and filter genes

cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674                                                   0
## 2 ENSG00000271254                                                1030
##   PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1                                                      0
## 2                                                    489
##   PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1                                                      0
## 2                                                    603
stopifnot(setequal(names(cts)[-1], meta$file_id))

meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##   90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

table(meta$differentiated_timepoint_value, useNA="ifany")
## 
##  6 
## 90
meta$timepoint = as.character(meta$differentiated_timepoint_value)

Read in gene annoation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)
## character(0)

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
table(model_s, useNA="ifany")
## model_s
## ExM PrS 
##  12  78
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM PrS ExM.1 PrS.1  ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674   0   0   0.0   0.0   0.00     0     0     3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25   923   948  1169
table(row.names(gene_info)==gene_info$Name, useNA="ifany")
## 
##  TRUE 
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0        0.0        0.0    0.00
## ENSG00000271254 ENSG00000271254     634     387      812.5      775.5  848.25
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674       0       0       3
## ENSG00000271254     923     948    1169
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.0  
##  Mean   :   376.3   Mean   :   255.1   Mean   :   530.5   Mean   :   585.1  
##  3rd Qu.:   120.0   3rd Qu.:    60.0   3rd Qu.:   188.5   3rd Qu.:   182.5  
##  Max.   :512761.0   Max.   :154723.0   Max.   :797550.5   Max.   :379480.0  
##     ExM_q75            PrS_q75            ExM_max            PrS_max      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     1  
##  Median :     3.0   Median :     3.0   Median :     6.0   Median :    10  
##  Mean   :   608.8   Mean   :   712.7   Mean   :   751.6   Mean   :  1102  
##  3rd Qu.:   228.0   3rd Qu.:   233.8   3rd Qu.:   285.0   3rd Qu.:   405  
##  Max.   :886970.0   Max.   :456738.2   Max.   :928104.0   Max.   :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12757   959
##   TRUE   2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25835    90
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")

ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Condition") + 
  scale_color_brewer(palette = "Set1")

table(meta$model_system, meta$run_id_short)
##      
##       robson-005-run2 robson-008 robson-009 robson-010 robson-011
##   ExM               0          0          0          0         12
##   PrS              30         24         12         12          0
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

Check the effect of hypoxia vs. normoxia among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   361.0   422.2   469.8   478.4   540.5   592.0
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 41.3 18.4  8.2  4.6  3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="Model", shape ="Condition") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="Run id", shape ="Condition") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

table(meta_m$run_id, meta_m$model_system)
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0   3
##   20230918_23-robson-008        0   6
##   20230918_23-robson-009        0   3
##   20230918_23-robson-010        0   3
##   20231004_23-robson-011        3   0

DE analysis with respect to model system for WT samples

## Generate sample information matrix

cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]

dim(sample_info)
## [1] 18  3
sample_info[1:2,]
##    model_system condition   q75
## 79          PrS       nor 569.5
## 12          PrS       hyp 545.0
sample_info$log_q75 = log(sample_info$q75)

dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                  ~ model_system + condition + log_q75)
dds = DESeq(dds)

## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
## [1] 25835     6
res_rd[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat    pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.73329722 0.4633772
##                      padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))
## 
## FALSE  TRUE 
## 17751  8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("WT Read depth")
print(g0)

## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)

res_model_system = as.data.frame(res_lrt)
dim(res_model_system)
## [1] 25835     6
res_model_system[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.04552192 0.83104721
##                       padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))
## 
## FALSE  TRUE 
## 23736  2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("WT Model system")
print(g0)

Analyze samples of different model systems or conditions

For the December 2023 release of JAX_RNAseq_ExtraEmbryonic, there are two model systems: ExM and PrS, and two conditions, nor and hyp.

There are three gene knock out strategies.

Explore the PC plots first, to decide which level to run DESeq2 on.

Then run the DE analysis.

Since there is only one day value for each model system, do not include day value in each scatter plot.

meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)
## 
## ExM_nor PrS_hyp PrS_nor 
##      12      12      66
table(meta$run_id, meta$model_condition)
##                              
##                               ExM_nor PrS_hyp PrS_nor
##   20230809_23-robson-005-run2       0       0      30
##   20230918_23-robson-008            0      12      12
##   20230918_23-robson-009            0       0      12
##   20230918_23-robson-010            0       0      12
##   20231004_23-robson-011           12       0       0
unique_model_condition_ss = unique(meta$model_condition)
unique_model_condition_ss = sort(unique_model_condition_ss)

for(cur_mc in unique_model_condition_ss){
  
  print(cur_mc)
  sample2kp = which(meta$model_condition == cur_mc)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
  extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
  print(table(sapply(extracted_ko_strings, length)))
  print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
  
  if (cur_mc == "PrS_nor"){
    print("The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:")
    print(meta_m$file_id[(meta_m$ko_strategy=="WT") & (unlist(extracted_ko_strings)=="KO")])
  }
  
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene, 
                          alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
    geom_point(size=2.5) + 
    scale_color_brewer(palette = "Dark2") + 
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + guides(alpha = "none") + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  table(meta_m$run_id, meta_m$ko_gene)
  
}
## [1] "ExM_nor"
##      
##       ISL1_CE ISL1_KO ISL1_PTC ISL1_WT
##   CE        3       0        0       0
##   KO        0       3        0       0
##   PTC       0       0        3       0
##   WT        0       0        0       3
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_hyp"
##      
##       CE_E06 CE_G07 CE_H06 KO_A03 KO_B01 KO_B02 PTC_A09 PTC_D10 PTC_G09 WT_1
##   CE       1      1      1      0      0      0       0       0       0    0
##   KO       0      0      0      1      1      1       0       0       0    0
##   PTC      0      0      0      0      0      0       1       1       1    0
##   WT       0      0      0      0      0      0       0       0       0    1
##      
##       WT_2 WT_3
##   CE     0    0
##   KO     0    0
##   PTC    0    0
##   WT     1    1
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_nor"
##      
##       CE_E06 CE_G07 CE_H06 FOSB_CE FOSB_KO FOSB_PTC GCM1_CE GCM1_KO GCM1_PTC
##   CE       1      1      1       3       0        0       3       0        0
##   KO       0      0      0       0       3        0       0       3        0
##   PTC      0      0      0       0       0        3       0       0        3
##   WT       0      0      0       0       0        0       0       0        0
##      
##       GCM1_WT GHRL1_CE GHRL1_KO GHRL1_PTC KO_A03 KO_B01 KO_B02 KOLF2_FOSB
##   CE        0        3        0         0      0      0      0          0
##   KO        0        0        3         0      1      1      1          0
##   PTC       0        0        0         3      0      0      0          0
##   WT        3        0        0         0      0      0      0          1
##      
##       KOLF2_GHRL1 KOLF2_POU2F3 POU2F3_CE POU2F3_KO POU2F3_PTC PPARG_CE PPARG_KO
##   CE            0            0         3         0          0        3        0
##   KO            0            0         0         3          0        0        3
##   PTC           0            0         0         0          3        0        0
##   WT            1            1         0         0          0        0        0
##      
##       PPARG_PTC PPARG_WT PTC_A09 PTC_D10 PTC_G09 WT_1 WT_2 WT_3
##   CE          0        0       0       0       0    0    0    0
##   KO          0        0       0       0       0    0    0    0
##   PTC         3        0       1       1       1    0    0    0
##   WT          0        3       0       0       0    1    1    1
## 
##  1 
## 66 
##      
##       CE KO PTC WT
##   CE  18  0   0  0
##   KO   0 18   0  0
##   PTC  0  0  18  0
##   WT   0  3   0  9
## [1] "The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:"
## [1] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"  
## [2] "KOLF2_FOSB_1_GT23-10493_GACCTGAA-CTCACCAA_S32_L001"  
## [3] "KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001"

Use each combination of (model system, condition) as DE group.

Run analysis for each DE group separately.

For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.

In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.). The first version of the output files contains gene id, and for each knockout strategy:

baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

The second version of the output files contains gene id,

chr, strand, position, gene name

and for each knockout strategy:

log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4), 

In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.

Prepare output directories for DE results

df_size = NULL

release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"

output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")

if (!file.exists(raw_output_dir)){
  dir.create(raw_output_dir, recursive = TRUE)
}

if (!file.exists(processed_output_dir)){
  dir.create(processed_output_dir, recursive = TRUE)
}

DE analysis for all genes

unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
unique_model_ss
## [1] "ExM" "PrS"
for (model1 in unique_model_ss){
  
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))  
  
  ## Generate sample information matrix
  cols2kp = c("model_system", "condition", "ko_strategy", "ko_gene", "run_id", 
              "run_id_short", "q75", "timepoint")
  sample_info = meta_m[,cols2kp]
  colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
    
  dim(sample_info)
  sample_info[1:2,]
      
  print(table(sample_info$ko_strategy, sample_info$ko_gene))
  
  sample_info$ko_group = paste(sample_info$ko_gene, 
                               sample_info$ko_strategy, sep="_")
  print(table(sample_info$ko_group))
  
  # use the combination of (model system, condition) as DE group
  
  sample_info$model_day = paste0(sample_info$model_system, 
                                 "_day_", 
                                 as.character(sample_info$day))
    
  sample_info$de_grp = paste0(sample_info$model_day, "_", 
                              sample_info$condition)

  sorted_de_grps = sort(unique(sample_info$de_grp))
  sorted_de_grps

  for(d_group in sorted_de_grps){
    
    print(d_group)
    
    dg_index = which(sample_info$de_grp==d_group)
    cts_dat_m_dg = cts_dat_m[, dg_index]
    sample_info_dg = sample_info[dg_index, ]

    stopifnot(all(sample_info_dg$de_grp==d_group))   
    
    print("-----------------------------------")      
    print(paste0("Model system: ", model1))  
    print(sprintf("DE analysis group %s DESeq2 results:", d_group))
    print("-----------------------------------")  
      

    # do two steps of filtering
    # first, filter based on q75 of each gene
    # second, filter based on whether each gene is expressed in at least 4 cells
  
    dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
    cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
    
    print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                  length(dg_q75), nrow(cts_dat_m_dg)))
    
    n_pos = rowSums(cts_dat_m_dg>0)
    cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
    
    if (length(n_pos)==nrow(cts_dat_m_dg)){
      print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
    }else{
      print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                    length(n_pos), nrow(cts_dat_m_dg)))    
    }
  
    # update the q75 based on only genes used in the process
    sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
    sample_info_dg$log_q75 = log(sample_info_dg$q75)   

    
    if (length(unique(sample_info_dg$run_id))==1){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + log_q75)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + run_id + log_q75)     
    }
    
    start.time <- Sys.time()
    dds = DESeq(dds)
    end.time <- Sys.time()
    
    time.taken <- end.time - start.time
    print(time.taken)
    
    ## association with read-depth
    res_rd = results(dds, name = "log_q75")
    res_rd = as.data.frame(res_rd)
    dim(res_rd)
    res_rd[1:2,]
    
    table(is.na(res_rd$padj))
    
    g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(d_group, " read depth"))
    print(g0)
    
    ## Rerun the analysis without read-depth if it is not significant for 
    ## most genes, and the number of samples is small
    ## i.e., proportion of non-null in the distribution is smaller than 0.01
    ## (this needs to be restricted to the genes with not NA adjusted pvalue)
    ## or if smaller than 0.1 and the number of samples is not greater than 6

    pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
    pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
    
    print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
    
    if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
      
      print("rerun DESeq2 without read depth")
      
      include_rd = 0
      if (length(unique(sample_info_dg$run_id))==1){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group)
      }else{
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + run_id)     
      }
      start.time <- Sys.time()
      dds = DESeq(dds)
      end.time <- Sys.time()
      
      time.taken <- end.time - start.time
      print(time.taken)
      
    }else{
      include_rd = 1
    }
    
    
     ## association with run_id
    
    if (length(unique(sample_info_dg$run_id))>1){
      
      if(include_rd){
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
      }else{
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
      }
      
      res_lrt = results(dds_lrt)
    
      res_run_id = as.data.frame(res_lrt)
      dim(res_run_id)
      res_run_id[1:2,]
    
      table(is.na(res_run_id$padj))
      
      g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(d_group, " Run ID"))
      print(g0)
      
    }   
    
    ## DE analysis for each KO gene
    table(sample_info_dg$ko_gene)
    table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
    
    genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
    genos = sort(genos)
    genos
    
    ko_grp  = c("CE", "KO", "PTC")
      
    for(geno in genos){
      
      res_geno_df = NULL
      res_geno_reduced_df = NULL
      res_geno = list()
      
      for(k_grp1 in ko_grp){
        
        res_g1 = results(dds, contrast = c("ko_group", 
                                           paste0(geno, "_", k_grp1), "WT_WT"))
        res_g1 = signif(data.frame(res_g1), 3)
        
        # add a column to record the number of zeros per test
        test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
        
        cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
        n_zero = rowSums(cts_dat_m_dg_test==0)
        res_g1$n_zeros = n_zero
        
        # record the number of samples involved in the comparison
        test_ko_group_vec = sample_info_dg$ko_group[test_index]
        
        df_size = rbind(df_size, 
                        c(d_group, 
                          paste0(geno, "_", k_grp1), 
                          sum(test_ko_group_vec!="WT_WT"), 
                          sum(test_ko_group_vec=="WT_WT")))
        
        # prepare a processed version of output
        res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
        res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
        
        if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
          res_g1_reduced$padj = NA
        }
        
        res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
        
        res_geno[[k_grp1]] = res_g1_reduced
        
        if ((sum(test_ko_group_vec!="WT_WT")>1) & (sum(test_ko_group_vec=="WT_WT")>1)){
          g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
            geom_histogram(color="darkblue", fill="lightblue", 
                           breaks = seq(0,1,by=0.01)) + 
            ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
          print(g1)
        }

        
        tag1 = sprintf("%s_%s_", geno, k_grp1)
        colnames(res_g1) = paste0(tag1, colnames(res_g1))
        colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))      
        
        if(is.null(res_geno_df)){
          res_geno_df = res_g1
        }else if(!is.null(res_geno_df)){
          stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
          res_geno_df = cbind(res_geno_df, res_g1)
        }

        if(is.null(res_geno_reduced_df)){
          res_geno_reduced_df = res_g1_reduced
        }else if(!is.null(res_geno_reduced_df)){
          stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
          res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
        }       

      }
      
      get_index <- function(x){
        which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
      }
      
      w_ce = get_index(res_geno[["CE"]])
      w_ko = get_index(res_geno[["KO"]])
      w_ptc = get_index(res_geno[["PTC"]])
 
      print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
      
      print(paste0("number of DE genes from knock out strategy CE: ", 
                   as.character(length(w_ce))))
      print(paste0("number of DE genes from knock out strategy KO: ", 
                   as.character(length(w_ko))))
      print(paste0("number of DE genes from knock out strategy PTC: ", 
                   as.character(length(w_ptc))))

      ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce], 
                                       rownames(res_geno[["KO"]])[w_ko]))
      
      ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko], 
                                        rownames(res_geno[["PTC"]])[w_ptc]))
      
      ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                        rownames(res_geno[["CE"]])[w_ce]))
      
      print(paste0("number of common DE genes by knock out strategies CE and KO: ", 
                   as.character(ce_ko_overlap)))
      print(paste0("number of common DE genes by knock out strategies KO and PTC: ", 
                   as.character(ko_ptc_overlap)))
      print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                   as.character(ptc_ce_overlap)))

      if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                               "KO" = rownames(res_geno[["KO"]])[w_ko], 
                               "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        print(g_up)
      }
      
      df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                       padj_KO = res_geno[["KO"]]$padj, 
                       padj_PTC = res_geno[["PTC"]]$padj)
  
      cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
      cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
      
      g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_CE))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
        scale_color_viridis()
      
      g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_KO))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) + 
        scale_color_viridis()
      
      print(g4)
      print(g5)

    
      dim(res_geno_df)
      res_geno_df[1:2,]
      
      dim(res_geno_reduced_df)
      res_geno_reduced_df[1:2,]
      
      res_df = data.frame(gene_ID=rownames(res_geno_df), 
                          res_geno_df)
      dim(res_df)
      res_df[1:2,]
  
      res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), 
                                              gene_anno$ensembl_gene_id), ]
      stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
  
      # set gene hgnc_symbol that equal "" to NA
      hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
      hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
        
      res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df), 
                                  hgnc_symbol=hgnc_symbol_vec,
                                  chr=res_reduced_gene_anno$chr, 
                                  strand=res_reduced_gene_anno$strand, 
                                  start=res_reduced_gene_anno$start, 
                                  end=res_reduced_gene_anno$end, 
                                  res_geno_reduced_df)
      
      print("hgnc_symbol empty string and NA tables:")
      print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
      print(table(is.na(res_reduced_df$hgnc_symbol)))
        
      dim(res_reduced_df)
      res_reduced_df[1:2,]
  
      setting_name = unique(paste0(sample_info_dg$model_day, "_", 
                                   sample_info_dg$condition))
      
      fwrite(res_df, 
             sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                     raw_output_dir, release_name, setting_name, geno), 
             sep="\t")
      fwrite(res_reduced_df, 
             sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                     processed_output_dir, release_name, setting_name, geno), 
             sep="\t")
    
    }
    
  }

}
## [1] "ExM"
## 
## TRUE 
##   12 
##      
##       ISL1 WT
##   CE     3  0
##   KO     3  0
##   PTC    3  0
##   WT     0  3
## 
##  ISL1_CE  ISL1_KO ISL1_PTC    WT_WT 
##        3        3        3        3 
## [1] "ExM_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: ExM"
## [1] "DE analysis group ExM_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 24876"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24876 to 23842"
## Time difference of 1.576037 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.520416 secs

## [1] "DE group ExM_day_6_nor under KO gene ISL1:"
## [1] "number of DE genes from knock out strategy CE: 652"
## [1] "number of DE genes from knock out strategy KO: 405"
## [1] "number of DE genes from knock out strategy PTC: 432"
## [1] "number of common DE genes by knock out strategies CE and KO: 343"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 380"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18935  4907 
## 
## FALSE  TRUE 
## 18935  4907 
## [1] "PrS"
## 
## TRUE 
##   78 
##      
##       EPAS1 FOSB GCM1 GRHL1 POU2F3 PPARG WT
##   CE      6    3    3     3      3     3  0
##   KO      6    3    3     3      3     3  0
##   PTC     6    3    3     3      3     3  0
##   WT      0    0    0     0      0     0 15
## 
##   EPAS1_CE   EPAS1_KO  EPAS1_PTC    FOSB_CE    FOSB_KO   FOSB_PTC    GCM1_CE 
##          6          6          6          3          3          3          3 
##    GCM1_KO   GCM1_PTC   GRHL1_CE   GRHL1_KO  GRHL1_PTC  POU2F3_CE  POU2F3_KO 
##          3          3          3          3          3          3          3 
## POU2F3_PTC   PPARG_CE   PPARG_KO  PPARG_PTC      WT_WT 
##          3          3          3          3         15 
## [1] "PrS_day_6_hyp"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_hyp DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23911"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23911 to 23326"
## Time difference of 9.605665 secs

## [1] "prop of non-null for rd: 8.49e-02"

## [1] "DE group PrS_day_6_hyp under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 4196"
## [1] "number of DE genes from knock out strategy KO: 2140"
## [1] "number of DE genes from knock out strategy PTC: 2553"
## [1] "number of common DE genes by knock out strategies CE and KO: 1883"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1628"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1960"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18654  4672 
## 
## FALSE  TRUE 
## 18654  4672 
## [1] "PrS_day_6_nor"
## [1] "-----------------------------------"
## [1] "Model system: PrS"
## [1] "DE analysis group PrS_day_6_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23493"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.409908 mins

## [1] "prop of non-null for rd: 1.30e-01"

## [1] "DE group PrS_day_6_nor under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 1372"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 304"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 29"
## [1] "number of common DE genes by knock out strategies PTC and CE: 236"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene FOSB:"
## [1] "number of DE genes from knock out strategy CE: 409"
## [1] "number of DE genes from knock out strategy KO: 847"
## [1] "number of DE genes from knock out strategy PTC: 121"
## [1] "number of common DE genes by knock out strategies CE and KO: 324"
## [1] "number of common DE genes by knock out strategies KO and PTC: 103"
## [1] "number of common DE genes by knock out strategies PTC and CE: 43"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene GCM1:"
## [1] "number of DE genes from knock out strategy CE: 5299"
## [1] "number of DE genes from knock out strategy KO: 4486"
## [1] "number of DE genes from knock out strategy PTC: 4255"
## [1] "number of common DE genes by knock out strategies CE and KO: 4013"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3770"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3823"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene GRHL1:"
## [1] "number of DE genes from knock out strategy CE: 3439"
## [1] "number of DE genes from knock out strategy KO: 2483"
## [1] "number of DE genes from knock out strategy PTC: 5095"
## [1] "number of common DE genes by knock out strategies CE and KO: 1980"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2044"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3057"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene POU2F3:"
## [1] "number of DE genes from knock out strategy CE: 2038"
## [1] "number of DE genes from knock out strategy KO: 590"
## [1] "number of DE genes from knock out strategy PTC: 6"
## [1] "number of common DE genes by knock out strategies CE and KO: 462"
## [1] "number of common DE genes by knock out strategies KO and PTC: 4"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_day_6_nor under KO gene PPARG:"
## [1] "number of DE genes from knock out strategy CE: 4503"
## [1] "number of DE genes from knock out strategy KO: 3029"
## [1] "number of DE genes from knock out strategy PTC: 4014"
## [1] "number of common DE genes by knock out strategies CE and KO: 2690"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2334"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3173"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

Save the size dataframe out

colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)

multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows
##  EPAS1_CE  EPAS1_KO EPAS1_PTC 
##         1         2         3
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]
##        DE_group knockout_gene_strategy n_ko n_WT
## 4 PrS_day_6_hyp               EPAS1_CE    3    3
## 5 PrS_day_6_hyp               EPAS1_KO    3    3
## 6 PrS_day_6_hyp              EPAS1_PTC    3    3
## 7 PrS_day_6_nor               EPAS1_CE    3   12
## 8 PrS_day_6_nor               EPAS1_KO    3   12
## 9 PrS_day_6_nor              EPAS1_PTC    3   12
fwrite(df_size, 
       sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name), 
       sep="\t")

DE analysis with respect to condition (hypoxia vs. normoxia)

This is only relevant for model system PrS.

Within model system PrS, for WT samples, only use the WT samples for EPAS1 gene for this analysis.

There are 24 samples involved. 6 WT and 6 EPAS1 knockout samples for each knockout strategy. Half of them are under normoxia and the other half are under hypoxia. All the 24 samples are from run_id “20230918_23-robson-008”.

For the 6 WT samples, we do not know whether they are from 3 single cell clones, with each clone having two groups of cells developed under different conditions. For now, run regular DESeq2 on them.

For the 6 knockout samples under each knockout strategy, they are from three single cell clones, with each clone having two groups of cells developed under different conditions. Run paired DESeq2 on them.

df_size = NULL

meta$ko_group = paste(meta$ko_gene, meta$ko_strategy, sep="_")
relevant_ko_groups = unique(meta$ko_group[which(meta$condition=="hyp")])
relevant_ko_groups = sort(relevant_ko_groups)

relevant_samples = meta$differentiated_biomaterial_id[which(meta$run_id=="20230918_23-robson-008")]
relevant_samples
##  [1] "PrS-MOK20012C-G07-nor" "PrS-MOK20012-WT1-hyp"  "PrS-MOK20012P-A09-nor"
##  [4] "PrS-MOK20012W-B02-hyp" "PrS-MOK20012-WT1-nor"  "PrS-MOK20012P-D10-nor"
##  [7] "PrS-MOK20012C-E06-nor" "PrS-MOK20012P-G09-hyp" "PrS-MOK20012-WT2-nor" 
## [10] "PrS-MOK20012C-E06-hyp" "PrS-MOK20012-WT3-hyp"  "PrS-MOK20012C-H06-hyp"
## [13] "PrS-MOK20012-WT2-hyp"  "PrS-MOK20012W-B01-nor" "PrS-MOK20012C-G07-hyp"
## [16] "PrS-MOK20012W-A03-nor" "PrS-MOK20012W-B02-nor" "PrS-MOK20012W-B01-hyp"
## [19] "PrS-MOK20012-WT3-nor"  "PrS-MOK20012C-H06-nor" "PrS-MOK20012W-A03-hyp"
## [22] "PrS-MOK20012P-G09-nor" "PrS-MOK20012P-A09-hyp" "PrS-MOK20012P-D10-hyp"
for (cur_ko_group in relevant_ko_groups){
  
  # note that gene_group is for indicating which knockout gene that WT samples are for
  # for knockout samples, its value is the knockout gene
  sample2kp = which((meta$differentiated_biomaterial_id%in%relevant_samples)&
                    (meta$ko_group == cur_ko_group))
  cts_dat_cond = cts_dat[,sample2kp]
  meta_cond    = meta[sample2kp,]
  
  dim(cts_dat_cond)
  dim(meta_cond)
  
  unique_model_day = unique(paste0(meta_cond$model_system, "_day_", 
                                   meta_cond$timepoint))
  print("")
  print("==========================================")
  print(paste0(unique_model_day, ", ", cur_ko_group, " samples"))
  print("hyp vs nor DE results:")
  print("==========================================")
  print("")
  
  stopifnot(all(meta_cond$file_id==colnames(cts_dat_cond)))
  table(meta_cond$file_id==colnames(cts_dat_cond), useNA="ifany")
  stopifnot(all(meta_cond$ko_group==cur_ko_group))
  
  print(table(meta_cond$condition, meta_cond$run_id_short))
  
  cols2kp = c("model_system", "timepoint", "biomaterial_id", "condition")
  sample_info_cond = meta_cond[,cols2kp]
  
  dim(sample_info_cond)
  sample_info_cond[1:2,]
  
  # do two steps of filtering
  # first, filter based on q75 of each gene
  # second, filter based on whether each gene is expressed in at least 4 cells
  
  cond_q75 = apply(cts_dat_cond, 1, quantile, probs=0.75)
  cts_dat_cond = cts_dat_cond[cond_q75 > 0,]
  
  print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                length(cond_q75), nrow(cts_dat_cond)))
  
  n_pos = rowSums(cts_dat_cond>0)
  cts_dat_cond = cts_dat_cond[n_pos >= 4,]
  
  if (length(n_pos)==nrow(cts_dat_cond)){
    print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
  }else{
    print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                  length(n_pos), nrow(cts_dat_cond)))    
  }
  
  # update the q75 based on only genes that will be used in the DE analysis
  sample_info_cond$q75 = apply(cts_dat_cond, 2, quantile, probs = 0.75)
  sample_info_cond$log_q75 = log(sample_info_cond$q75)
  
  if (cur_ko_group=="WT_WT"){
    dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                  ~ condition + log_q75)
  }else{
    dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                  ~ biomaterial_id + log_q75 + condition)     
  }
  
  start.time <- Sys.time()
  dds = DESeq(dds)
  end.time <- Sys.time()
  
  time.taken <- end.time - start.time
  print(time.taken)
  
  ## association with read-depth
  res_rd = results(dds, name = "log_q75")
  res_rd = as.data.frame(res_rd)
  dim(res_rd)
  res_rd[1:2,]
  
  table(is.na(res_rd$padj))
  
  g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
    geom_histogram(color="darkblue", fill="lightblue", 
                   breaks = seq(0,1,by=0.01)) + 
    ggtitle(paste0(unique_model_day, " ", cur_ko_group, " hyp vs nor\nread depth"))
  print(g0)
  
  ## Rerun the analysis without read-depth if it is not significant for 
  ## most genes, and the number of samples is small
  ## i.e., proportion of non-null in the distribution is smaller than 0.01
  ## (this needs to be restricted to the genes with not NA adjusted pvalue)
  ## or if smaller than 0.1 and the number of samples is not greater than 6
  
  pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
  pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
  
  print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
  
  if(pi_1_rd < 0.01 || (ncol(cts_dat_cond) <= 6 && pi_1_rd < 0.1 )){
    print("rerun DESeq2 without read depth")
    if (cur_ko_group=="WT_WT"){
      dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                    ~ condition)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_cond, colData=sample_info_cond, 
                                    ~ biomaterial_id + condition)  
    }
    dds = DESeq(dds)
  }
  
  ## DE analysis between two conditions
  res_g1 = results(dds, contrast = c("condition", "hyp", "nor"))
  res_g1 = signif(data.frame(res_g1), 3)
  
  # add a column to record the number of zeros per test
  n_zero = rowSums(cts_dat_cond==0)
  res_g1$n_zeros = n_zero    
  
  # record the number of samples involved in the comparison
  df_size = rbind(df_size, 
                  c(unique_model_day, 
                    cur_ko_group, 
                    sum(sample_info_cond$condition=="hyp"), 
                    sum(sample_info_cond$condition=="nor")))   
    
  # prepare a processed version of output
  res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
  res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
  
  res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]      
  
  n_DE = sum(res_g1_reduced$padj < 0.05 & abs(res_g1_reduced$log2FoldChange) > log2(1.5), 
             na.rm = TRUE)
  print(paste0(unique_model_day, " between nor and hyp conditions, ", cur_ko_group, ":"))
  print(sprintf("number of DE genes: %d", n_DE))
  
  g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
    geom_histogram(color="darkblue", fill="lightblue", 
                   breaks = seq(0,1,by=0.01)) + 
    ggtitle(paste0(unique_model_day, " ", cur_ko_group, "\nhyp vs nor"))
  print(g1)
  
  tag1 = "hyp_vs_nor_"
  colnames(res_g1) = paste0(tag1, colnames(res_g1))
  colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
  
  
  res_df = data.frame(gene_ID=rownames(res_g1), 
                      res_g1)
  dim(res_df)
  res_df[1:2,]
    
    
  res_reduced_gene_anno = gene_anno[match(rownames(res_g1_reduced), 
                                          gene_anno$ensembl_gene_id), ]
  stopifnot(all(rownames(res_g1_reduced)==res_reduced_gene_anno$ensembl_gene_id))     
    
  # set gene hgnc_symbol that equal "" to NA
  hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
  hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
    
  res_reduced_df = data.frame(gene_ID=rownames(res_g1_reduced), 
                              hgnc_symbol=hgnc_symbol_vec,
                              chr=res_reduced_gene_anno$chr, 
                              strand=res_reduced_gene_anno$strand, 
                              start=res_reduced_gene_anno$start, 
                              end=res_reduced_gene_anno$end, 
                              res_g1_reduced)
  
  print("hgnc_symbol empty string and NA tables:")
  print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
  print(table(is.na(res_reduced_df$hgnc_symbol)))
      
  dim(res_reduced_df)
  res_reduced_df[1:2,]      
    
  fwrite(res_df, 
         sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                 raw_output_dir, release_name, 
                 paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"), 
         sep="\t")
  fwrite(res_reduced_df, 
         sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                 processed_output_dir, release_name, 
                 paste0(unique_model_day, "_", cur_ko_group), "hyp_vs_nor"), 
         sep="\t")  

}
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_CE samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23378"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23378 to 20937"
## Time difference of 30.83262 secs

## [1] "prop of non-null for rd: 4.29e-01"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_CE:"
## [1] "number of DE genes: 3518"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17474  3463 
## 
## FALSE  TRUE 
## 17474  3463 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_KO samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23542"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23542 to 21177"
## Time difference of 1.093934 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_KO:"
## [1] "number of DE genes: 2518"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17534  3643 
## 
## FALSE  TRUE 
## 17534  3643 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, EPAS1_PTC samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23382"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23382 to 21031"
## Time difference of 53.65962 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, EPAS1_PTC:"
## [1] "number of DE genes: 3664"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17464  3567 
## 
## FALSE  TRUE 
## 17464  3567 
## [1] ""
## [1] "=========================================="
## [1] "PrS_day_6, WT_WT samples"
## [1] "hyp vs nor DE results:"
## [1] "=========================================="
## [1] ""
##      
##       robson-008
##   hyp          3
##   nor          3
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23704"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23704 to 21221"
## Time difference of 9.307755 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## [1] "PrS_day_6 between nor and hyp conditions, WT_WT:"
## [1] "number of DE genes: 6724"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 17531  3690 
## 
## FALSE  TRUE 
## 17531  3690
colnames(df_size) = c("model_system_day", "knockout_gene_strategy", "n_hyp", "n_nor")
fwrite(df_size, 
       sprintf("%s/%s_DE_hyp_vs_nor_n_samples.tsv", output_dir, release_name), 
       sep="\t")
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  8143440 435.0   12622092 674.1         NA 12622092 674.1
## Vcells 34189957 260.9   96788224 738.5      65536 96788224 738.5
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.3                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.15.4          
##  [5] dplyr_1.1.4                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.5               viridisLite_0.4.2          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-60                
## [21] stringr_1.5.1               ggpubr_0.6.0               
## [23] ggrepel_0.9.5               ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-8           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.7             tools_4.2.3            backports_1.5.0       
##  [7] bslib_0.8.0            utf8_1.2.4             R6_2.5.1              
## [10] DBI_1.2.3              colorspace_2.1-1       GetoptLong_1.0.5      
## [13] withr_3.0.1            tidyselect_1.2.1       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.3         cli_3.6.3             
## [19] Cairo_1.6-2            DelayedArray_0.24.0    labeling_0.4.3        
## [22] sass_0.4.9             scales_1.3.0           digest_0.6.37         
## [25] rmarkdown_2.28         XVector_0.38.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.8.1      highr_0.11             fastmap_1.2.0         
## [31] GlobalOptions_0.1.2    rlang_1.1.4            rstudioapi_0.16.0     
## [34] RSQLite_2.3.7          farver_2.1.2           shape_1.4.6.1         
## [37] jquerylib_0.1.4        generics_0.1.3         jsonlite_1.8.8        
## [40] BiocParallel_1.32.6    car_3.1-2              RCurl_1.98-1.16       
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.9 Matrix_1.5-4.1        
## [46] Rcpp_1.0.13            munsell_0.5.1          fansi_1.0.6           
## [49] abind_1.4-5            lifecycle_1.0.4        stringi_1.8.4         
## [52] yaml_2.3.10            carData_3.0-5          zlibbioc_1.44.0       
## [55] plyr_1.8.9             blob_1.2.4             parallel_4.2.3        
## [58] crayon_1.5.3           lattice_0.22-6         Biostrings_2.66.0     
## [61] annotate_1.76.0        circlize_0.4.16        KEGGREST_1.38.0       
## [64] locfit_1.5-9.10        knitr_1.48             pillar_1.9.0          
## [67] rjson_0.2.21           ggsignif_0.6.4         geneplotter_1.76.0    
## [70] codetools_0.2-20       XML_3.99-0.17          glue_1.7.0            
## [73] evaluate_0.24.0        foreach_1.5.2          vctrs_0.6.5           
## [76] png_0.1-8              cellranger_1.1.0       gtable_0.3.5          
## [79] purrr_1.0.2            tidyr_1.3.1            clue_0.3-65           
## [82] cachem_1.1.0           xfun_0.47              xtable_1.8-4          
## [85] broom_1.0.6            rstatix_0.7.2          tibble_3.2.1          
## [88] iterators_1.0.14       AnnotationDbi_1.60.2   memoise_2.0.1         
## [91] cluster_2.1.6